Regressions

PCA of Volume

Here, we use scikit-learn to compute the PCA of our volume. This will aid us in determining better axes along which to sample our data.


In [3]:
from sklearn.decomposition import PCA
import numpy
import csv

In [4]:
# Load the synapse CSV (columns: x, y, z, unmasked, synapses) and build a
# dense 3-D volume of synapse counts indexed by coordinate rank.
fieldnames = ['x', 'y', 'z', 'unmasked', 'synapses']

# `with` guarantees the file handle is closed; feed the file object directly
# to csv.reader rather than materializing all lines first.
with open('../data/data.csv', 'r') as csv_file:
    reader = csv.reader(csv_file)
    next(reader)  # skip the header row (Python-3 compatible; reader.next() is Py2-only)
    rows = [[int(col) for col in row] for row in reader]

# Unique, sorted coordinate values along each axis.
sorted_x = sorted(set(r[0] for r in rows))
sorted_y = sorted(set(r[1] for r in rows))
sorted_z = sorted(set(r[2] for r in rows))

# numpy.zeros, NOT numpy.ndarray: ndarray(shape) leaves memory uninitialized,
# so voxels never assigned below would contain garbage instead of 0.
volume = numpy.zeros((len(sorted_x), len(sorted_y), len(sorted_z)))
for row in rows:
    if row[-1] != 0:
        volume[sorted_x.index(row[0]), sorted_y.index(row[1]), sorted_z.index(row[2])] = row[-1]

In [5]:
# Project the first z-slice of the volume onto its leading principal axis.
first_slice = volume[:, :, 0]
pca = PCA(n_components=1)
transform = pca.fit_transform(first_slice)

Regressions

We run linear regression over our data and a test set to compare their outcomes (plotted below). We then use `sklearn.cross_validation` (the legacy name of today's `sklearn.model_selection`) to train on $x$ samples and predict the remaining $\text{total} - x$ test samples.


In [6]:
%matplotlib inline
from sklearn import linear_model

# Split the data into training/testing sets
x_train = sorted_x[:-20]
x_test = sorted_x[-20:]

# Split thxe targets into training/testing sets
y_train = sorted_z[:-20]
y_test = sorted_z[-20:]

# Create linear regression object
regr = linear_model.LinearRegression()

# Train the model using the training sets
regr.fit(x_train, y_train)

# The coefficients
print('Coefficients: \n', regr.coef_)
# The mean square error
print("Residual sum of squares: %.2f"
      % numpy.mean((regr.predict(x_test) - y_test) ** 2))
# Explained variance score: 1 is perfect prediction
print('Variance score: %.2f' % regr.score(x_test, y_test))

# Plot outputs
plt.scatter(x_test, y_test,  color='black')
plt.plot(x_test, regr.predict(x_test), color='blue',
         linewidth=3)

plt.xticks(())
plt.yticks(())

plt.show()


/usr/local/lib/python2.7/site-packages/sklearn/utils/validation.py:386: DeprecationWarning: Passing 1d arrays as data is deprecated in 0.17 and willraise ValueError in 0.19. Reshape your data either using X.reshape(-1, 1) if your data has a single feature or X.reshape(1, -1) if it contains a single sample.
  DeprecationWarning)
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-6-60af08c13081> in <module>()
     14 
     15 # Train the model using the training sets
---> 16 regr.fit(x_train, y_train)
     17 
     18 # The coefficients

/usr/local/lib/python2.7/site-packages/sklearn/linear_model/base.pyc in fit(self, X, y, sample_weight)
    425         n_jobs_ = self.n_jobs
    426         X, y = check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'],
--> 427                          y_numeric=True, multi_output=True)
    428 
    429         if ((sample_weight is not None) and np.atleast_1d(sample_weight).ndim > 1):

/usr/local/lib/python2.7/site-packages/sklearn/utils/validation.pyc in check_X_y(X, y, accept_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, multi_output, ensure_min_samples, ensure_min_features, y_numeric, warn_on_dtype, estimator)
    511     if multi_output:
    512         y = check_array(y, 'csr', force_all_finite=True, ensure_2d=False,
--> 513                         dtype=None)
    514     else:
    515         y = column_or_1d(y, warn=True)

/usr/local/lib/python2.7/site-packages/sklearn/utils/validation.pyc in check_array(array, accept_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, warn_on_dtype, estimator)
    405                              " minimum of %d is required%s."
    406                              % (n_samples, shape_repr, ensure_min_samples,
--> 407                                 context))
    408 
    409     if ensure_min_features > 0 and array.ndim == 2:

ValueError: Found array with 0 sample(s) (shape=(0,)) while a minimum of 1 is required.

In [ ]: